Fashion category prediction¶
This notebook predicts fashion categories based on bounding box coordinates using machine learning models with hyperparameter tuning and cross-validation.
In [3]:
pip install lightgbm
Requirement already satisfied: lightgbm in /Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages (4.6.0) Requirement already satisfied: numpy>=1.17.0 in /Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages (from lightgbm) (1.26.4) Requirement already satisfied: scipy in /Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages (from lightgbm) (1.14.1) Note: you may need to restart the kernel to use updated packages.
In [4]:
import json
import numpy as np
import pandas as pd
import lightgbm as lgb
import time
from sklearn import svm
from datetime import datetime
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
from sklearn.neural_network import MLPClassifier
import plotly.io as pio
pio.renderers.default = 'notebook'
import warnings
warnings.filterwarnings('ignore')
Data collection and exploratory analysis¶
In [5]:
# Load fashion data
with open('fashion.json', 'r') as f:
fashion_data = [json.loads(line) for line in f]
with open('fashion-cat.json', 'r') as f:
fashion_cat = json.load(f)
In [6]:
print(f"Total products: {len(fashion_data)}")
print(f"Total unique categories: {len(fashion_cat)}")
df = pd.DataFrame(fashion_data)
# Add category information
df['category'] = df['product'].map(fashion_cat)
df.head()
Total products: 72198 Total unique categories: 38111
Out[6]:
| product | scene | bbox | category | |
|---|---|---|---|---|
| 0 | 0027e30879ce3d87f82f699f148bff7e | cdab9160072dd1800038227960ff6467 | [0.434097, 0.859363, 0.560254, 1.0] | Apparel & Accessories|Shoes |
| 1 | 0027e30879ce3d87f82f699f148bff7e | 14f59334af4539132981b1324a731067 | [0.175269, 0.527773, 0.621485, 0.924899] | Apparel & Accessories|Shoes |
| 2 | 0027e30879ce3d87f82f699f148bff7e | e7d32df9f45b691afc580808750f73ca | [0.588666, 0.638503, 0.750647, 0.761368] | Apparel & Accessories|Shoes |
| 3 | 0027e30879ce3d87f82f699f148bff7e | c0be585ed21b1a6c6dc9559ebe007ede | [0.276699, 0.757741, 0.400485, 0.876138] | Apparel & Accessories|Shoes |
| 4 | 002a6586b8381b5efd39410657630b44 | 67ed2a06be8a26dc63d7a04d4e1a135f | [0.154545, 0.144809, 0.809091, 0.784153] | Apparel & Accessories|Handbags, Wallets & Cases |
In [7]:
print("Missing values in dataframe:")
print(df.isnull().sum())
print(f"\nTotal missing values: {df.isnull().sum().sum()}")
print(f"Percentage of missing values: {(df.isnull().sum().sum() / len(df) * 100):.2f}%")
if 'category' in df.columns:
missing_categories = df['category'].isnull().sum()
print(f"\nMissing categories: {missing_categories} ({missing_categories/len(df)*100:.2f}%)")
if 'bbox' in df.columns:
missing_bbox = df['bbox'].isnull().sum()
print(f"Missing bbox: {missing_bbox} ({missing_bbox/len(df)*100:.2f}%)")
Missing values in dataframe: product 0 scene 0 bbox 0 category 0 dtype: int64 Total missing values: 0 Percentage of missing values: 0.00% Missing categories: 0 (0.00%) Missing bbox: 0 (0.00%)
In [8]:
category_counts = df['category'].value_counts()
print(f"Number of unique categories: {len(category_counts)}")
print(f"\nTop 5 categories:")
print(category_counts.head(5))
fig = px.bar(
x=category_counts.head(5).index,
y=category_counts.head(5).values,
labels={'x': 'category', 'y': 'count'},
title='Top 5 categories by frequency'
)
fig.update_xaxes(tickangle=20)
fig.show()
Number of unique categories: 10 Top 5 categories: category Apparel & Accessories|Shoes 22706 Apparel & Accessories|Clothing|Pants 14289 Apparel & Accessories|Clothing|Shirts & Tops 11957 Apparel & Accessories|Handbags, Wallets & Cases 6322 Apparel & Accessories|Clothing|Outerwear|Coats & Jackets 4788 Name: count, dtype: int64
Feature engineering¶
In [9]:
def extract_features(bbox):
x1, y1, x2, y2 = bbox
features = {
'x1': x1,
'y1': y1,
'x2': x2,
'y2': y2,
}
width = x2 - x1
height = y2 - y1
area = width * height
center_x = (x1 + x2) / 2
center_y = (y1 + y2) / 2
aspect_ratio = width / height
features.update({
'width': width,
'height': height,
'area': area,
'center_x': center_x,
'center_y': center_y,
'aspect_ratio': aspect_ratio,
'diagonal': np.sqrt(width**2 + height**2),
'perimeter': 2 * (width + height)
})
return features
bbox_features = df['bbox'].apply(extract_features)
feature_df = pd.DataFrame(list(bbox_features))
df_features = pd.concat([df[['product', 'category']], feature_df], axis=1)
print(f"Feature columns: {list(feature_df.columns)}")
df_features.head()
Feature columns: ['x1', 'y1', 'x2', 'y2', 'width', 'height', 'area', 'center_x', 'center_y', 'aspect_ratio', 'diagonal', 'perimeter']
Out[9]:
| product | category | x1 | y1 | x2 | y2 | width | height | area | center_x | center_y | aspect_ratio | diagonal | perimeter | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0027e30879ce3d87f82f699f148bff7e | Apparel & Accessories|Shoes | 0.434097 | 0.859363 | 0.560254 | 1.000000 | 0.126157 | 0.140637 | 0.017742 | 0.497175 | 0.929682 | 0.897040 | 0.188929 | 0.533588 |
| 1 | 0027e30879ce3d87f82f699f148bff7e | Apparel & Accessories|Shoes | 0.175269 | 0.527773 | 0.621485 | 0.924899 | 0.446216 | 0.397126 | 0.177204 | 0.398377 | 0.726336 | 1.123613 | 0.597342 | 1.686684 |
| 2 | 0027e30879ce3d87f82f699f148bff7e | Apparel & Accessories|Shoes | 0.588666 | 0.638503 | 0.750647 | 0.761368 | 0.161981 | 0.122865 | 0.019902 | 0.669656 | 0.699936 | 1.318366 | 0.203307 | 0.569692 |
| 3 | 0027e30879ce3d87f82f699f148bff7e | Apparel & Accessories|Shoes | 0.276699 | 0.757741 | 0.400485 | 0.876138 | 0.123786 | 0.118397 | 0.014656 | 0.338592 | 0.816939 | 1.045516 | 0.171292 | 0.484366 |
| 4 | 002a6586b8381b5efd39410657630b44 | Apparel & Accessories|Handbags, Wallets & Cases | 0.154545 | 0.144809 | 0.809091 | 0.784153 | 0.654546 | 0.639344 | 0.418480 | 0.481818 | 0.464481 | 1.023777 | 0.914982 | 2.587780 |
In [10]:
fig = make_subplots(
rows=3, cols=3,
subplot_titles=('width', 'height', 'area', 'center X', 'center Y', 'aspect ratio',
'diagonal', 'perimeter', 'x1'),
specs=[[{"secondary_y": False} for _ in range(3)] for _ in range(3)]
)
features_to_plot = ['width', 'height', 'area', 'center_x', 'center_y',
'aspect_ratio', 'diagonal', 'perimeter', 'x1']
for idx, feature in enumerate(features_to_plot):
row = idx // 3 + 1
col = idx % 3 + 1
fig.add_trace(
go.Histogram(x=feature_df[feature], nbinsx=50, name=feature, showlegend=False),
row=row, col=col
)
fig.update_layout(height=900, title_text="feature distributions")
fig.show()
Random Uniform Classifier¶
In [11]:
class_labels = [
'Sunglasses',
'Coats & Jackets',
'Pants',
'Shirts & Tops',
'Shorts',
'Skirts',
'Handbags, Wallets & Cases',
'Earrings',
'Necklaces',
'Shoes'
]
# Equal probability for 10 classes
probabilities = [1/10] * 10 # 0.10 each
plt.figure(figsize=(10, 5))
# Pink color palette (light → medium → dark)
dark_pink = "#FF69B4" # you can change to "#FFC0CB" or "#C71585" if you prefer
plt.bar(class_labels, probabilities, color='pink')
# Formatting
plt.ylabel("Probability")
plt.title("Random Uniform Classifier")
plt.xticks(rotation=45, ha='right', fontsize=8)
plt.ylim(0, 0.15)
# Horizontal line at 0.1 for emphasis
plt.axhline(0.1, color=dark_pink, linestyle="--", linewidth=1)
plt.tight_layout()
plt.show()
Pre-processing¶
In [12]:
feature_columns = ['x1', 'y1', 'x2', 'y2', 'width', 'height', 'area',
'center_x', 'center_y', 'aspect_ratio', 'diagonal', 'perimeter']
X = df_features[feature_columns].values
y = df_features['category'].values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(f"Number of features: {X.shape[1]}")
print(f"Number of samples: {X.shape[0]}")
print(f"Number of classes: {len(label_encoder.classes_)}")
print(f"\nClass distribution:")
unique, counts = np.unique(y_encoded, return_counts=True)
for cls, count in zip(unique[:10], counts[:10]):
print(f"Class {cls} ({label_encoder.inverse_transform([cls])[0]}): {count} samples")
Number of features: 12 Number of samples: 72198 Number of classes: 10 Class distribution: Class 0 (Apparel & Accessories|Clothing Accessories|Sunglasses): 4577 samples Class 1 (Apparel & Accessories|Clothing|Outerwear|Coats & Jackets): 4788 samples Class 2 (Apparel & Accessories|Clothing|Pants): 14289 samples Class 3 (Apparel & Accessories|Clothing|Shirts & Tops): 11957 samples Class 4 (Apparel & Accessories|Clothing|Shorts): 2752 samples Class 5 (Apparel & Accessories|Clothing|Skirts): 1872 samples Class 6 (Apparel & Accessories|Handbags, Wallets & Cases): 6322 samples Class 7 (Apparel & Accessories|Jewelry|Earrings): 1507 samples Class 8 (Apparel & Accessories|Jewelry|Necklaces): 1428 samples Class 9 (Apparel & Accessories|Shoes): 22706 samples
In [13]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
X_scaled, y_encoded, test_size=0.2, stratify=y_encoded
)
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Feature dimensions: {X_train.shape[1]}")
Training set size: 57758 Test set size: 14440 Feature dimensions: 12
Hyperparameter tuning and modeling¶
In [14]:
models = {
'random forest': {
'model': RandomForestClassifier(n_jobs=-1),
'params': {
'n_estimators': [50, 100],
'max_depth': [10, 20],
'min_samples_split': [2, 5]
}
},
'logistic regression': {
'model': LogisticRegression(max_iter=1000, n_jobs=-1),
'params': {
'C': [0.1, 1],
'solver': ['lbfgs', 'liblinear']
}
},
'Neural Network': {
'model': MLPClassifier(random_state=42, max_iter=500),
'params': {
'hidden_layer_sizes': [(50,), (100,)], # Reduced from [(50,), (100,), (50, 50)]
'alpha': [0.0001, 0.001], # Reduced from [0.0001, 0.001, 0.01]
'learning_rate': ['constant'] # Reduced from ['constant', 'adaptive']
}
},
'SVM': {
'model': svm.SVC(random_state=42),
'params': {
'C': [1], # Reduced from [0.1, 1, 10]
'kernel': ['rbf'] # Reduced from ['linear', 'rbf', 'poly', 'sigmoid']
}
},
}
In [15]:
best_models = {}
cv_results_all = {}
kfold = KFold(n_splits=3, shuffle=True)
print(f"Starting hyperparameter tuning with {kfold.n_splits}-fold CV")
print(f"Total models to train: {len(models)}\n")
for idx, (name, model_config) in enumerate(models.items(), 1):
start_time = time.time()
print(f"[{idx}/{len(models)}] Training {name}")
print(f"Start time: {datetime.now().strftime('%H:%M:%S')}")
param_grid = model_config['params']
n_combinations = 1
for param_values in param_grid.values():
n_combinations *= len(param_values)
total_fits = n_combinations * kfold.n_splits
print(f"Parameter combinations: {n_combinations}")
print(f"Total fits (combinations × CV folds): {total_fits}")
print(f"Testing parameters: {param_grid}")
grid_search = GridSearchCV(
model_config['model'],
model_config['params'],
cv=kfold,
scoring='accuracy',
n_jobs=-1,
verbose=1
)
grid_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time
best_models[name] = grid_search.best_estimator_
cv_results_all[name] = {
'best_params': grid_search.best_params_,
'best_score': grid_search.best_score_,
'cv_results': grid_search.cv_results_
}
print(f"\n{name} completed in {elapsed_time:.1f} seconds ({elapsed_time/60:.1f} minutes)")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score (accuracy): {grid_search.best_score_:.4f}")
print(f"End time: {datetime.now().strftime('%H:%M:%S')}")
Starting hyperparameter tuning with 3-fold CV
Total models to train: 4
[1/4] Training random forest
Start time: 19:13:31
Parameter combinations: 8
Total fits (combinations × CV folds): 24
Testing parameters: {'n_estimators': [50, 100], 'max_depth': [10, 20], 'min_samples_split': [2, 5]}
Fitting 3 folds for each of 8 candidates, totalling 24 fits
random forest completed in 28.0 seconds (0.5 minutes)
Best parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score (accuracy): 0.6391
End time: 19:13:59
[2/4] Training logistic regression
Start time: 19:13:59
Parameter combinations: 4
Total fits (combinations × CV folds): 12
Testing parameters: {'C': [0.1, 1], 'solver': ['lbfgs', 'liblinear']}
Fitting 3 folds for each of 4 candidates, totalling 12 fits
/Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 11. warnings.warn( /Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 11. warnings.warn( /Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 11. warnings.warn( /Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 11. warnings.warn( /Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 11. warnings.warn( /Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 11. warnings.warn(
logistic regression completed in 5.4 seconds (0.1 minutes)
Best parameters: {'C': 1, 'solver': 'lbfgs'}
Best CV score (accuracy): 0.4963
End time: 19:14:04
[3/4] Training Neural Network
Start time: 19:14:04
Parameter combinations: 4
Total fits (combinations × CV folds): 12
Testing parameters: {'hidden_layer_sizes': [(50,), (100,)], 'alpha': [0.0001, 0.001], 'learning_rate': ['constant']}
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Neural Network completed in 57.4 seconds (1.0 minutes)
Best parameters: {'alpha': 0.001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
Best CV score (accuracy): 0.5503
End time: 19:15:02
[4/4] Training SVM
Start time: 19:15:02
Parameter combinations: 1
Total fits (combinations × CV folds): 3
Testing parameters: {'C': [1], 'kernel': ['rbf']}
Fitting 3 folds for each of 1 candidates, totalling 3 fits
SVM completed in 165.0 seconds (2.7 minutes)
Best parameters: {'C': 1, 'kernel': 'rbf'}
Best CV score (accuracy): 0.5400
End time: 19:17:46
Cross-validation results¶
In [16]:
cv_scores_data = []
for name, results in cv_results_all.items():
cv_results = results['cv_results']
mean_scores = cv_results['mean_test_score']
std_scores = cv_results['std_test_score']
param_names = list(cv_results['params'][0].keys())
for i, params in enumerate(cv_results['params']):
param_str = ', '.join([f"{k}={v}" for k, v in params.items()])
cv_scores_data.append({
'Model': name,
'Parameters': param_str[:50] + '...' if len(param_str) > 50 else param_str,
'Accuracy': mean_scores[i],
'Std': std_scores[i]
})
cv_df = pd.DataFrame(cv_scores_data)
fig = go.Figure()
for model_name in cv_df['Model'].unique():
model_data = cv_df[cv_df['Model'] == model_name].sort_values('Accuracy', ascending=False)
fig.add_trace(go.Scatter(
x=model_data['Parameters'],
y=model_data['Accuracy'],
error_y=dict(type='data', array=model_data['Std']),
mode='markers+lines',
name=model_name,
text=model_data['Parameters'],
hovertemplate='<b>%{text}</b><br>Accuracy: %{y:.4f}<br>Std: %{customdata:.4f}<extra></extra>',
customdata=model_data['Std']
))
fig.update_layout(
title='Cross-validation Scores',
xaxis_title='Hyperparameter Configuration',
yaxis_title='Accuracy',
height=600,
xaxis=dict(tickangle=45, tickmode='array', tickvals=[])
)
fig.show()
In [17]:
best_scores = {name: results['best_score'] for name, results in cv_results_all.items()}
fig = go.Figure(data=[
go.Bar(
x=list(best_scores.keys()),
y=list(best_scores.values()),
text=[f'{v:.4f}' for v in best_scores.values()],
textposition='auto',
marker_color='pink'
)
])
fig.update_layout(
title='best cross-validation accuracy scores',
xaxis_title='model',
yaxis_title='accuracy',
height=500
)
fig.show()
print("best cross-validation accuracy scores:")
for name, score in sorted(best_scores.items(), key=lambda x: x[1], reverse=True):
print(f" {name}: {score:.4f}")
best cross-validation accuracy scores: random forest: 0.6391 Neural Network: 0.5503 SVM: 0.5400 logistic regression: 0.4963
Training and test accuracies¶
In [18]:
train_results = {}
test_results = {}
for name, model in best_models.items():
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
test_results[name] = {
'accuracy': accuracy,
'mse': mse,
'predictions': y_pred
}
for name, model in best_models.items():
# Predictions on training set
y_train_pred = model.predict(X_train)
# Calculate metrics
train_accuracy = accuracy_score(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_results[name] = {
'accuracy': train_accuracy,
'mse': train_mse
}
print(f"{name}:")
print(f"training accuracy: {train_accuracy:.4f}")
print(f"test accuracy: {test_results[name]['accuracy']:.4f}")
print(f"difference: {train_accuracy - test_results[name]['accuracy']:.4f}\n")
random forest: training accuracy: 0.8293 test accuracy: 0.6600 difference: 0.1694 logistic regression: training accuracy: 0.4972 test accuracy: 0.4984 difference: -0.0012 Neural Network: training accuracy: 0.5580 test accuracy: 0.5545 difference: 0.0035 SVM: training accuracy: 0.5459 test accuracy: 0.5445 difference: 0.0014
In [19]:
model_names = list(test_results.keys())
train_accuracies = [train_results[name]['accuracy'] for name in model_names]
test_accuracies = [test_results[name]['accuracy'] for name in model_names]
fig = go.Figure()
fig.add_trace(go.Bar(
x=model_names,
y=train_accuracies,
name='Training Accuracy',
marker_color='maroon',
text=[f'{v:.4f}' for v in train_accuracies],
textposition='auto'
))
fig.add_trace(go.Bar(
x=model_names,
y=test_accuracies,
name='Test Accuracy',
marker_color='pink',
text=[f'{v:.4f}' for v in test_accuracies],
textposition='auto'
))
fig.update_layout(
title='Training vs Test Accuracy Comparison',
xaxis_title='Model',
yaxis_title='Accuracy',
barmode='group',
height=500,
yaxis=dict(range=[0, 1])
)
fig.show()
for name in model_names:
gap = train_results[name]['accuracy'] - test_results[name]['accuracy']
print(f"{name}: {gap:+.4f} ({'Overfitting' if gap > 0.05 else 'Good generalization' if gap < 0.02 else 'Moderate gap'})")
random forest: +0.1694 (Overfitting) logistic regression: -0.0012 (Good generalization) Neural Network: +0.0035 (Good generalization) SVM: +0.0014 (Good generalization)
In [20]:
test_accuracies = [test_results[name]['accuracy'] for name in test_results.keys()]
test_mses = [test_results[name]['mse'] for name in test_results.keys()]
fig = make_subplots(
rows=1, cols=2,
subplot_titles=('test accuracy', 'Test MSE'),
specs=[[{"secondary_y": False}, {"secondary_y": False}]]
)
fig.add_trace(
go.Bar(x=list(test_results.keys()), y=test_accuracies,
text=[f'{v:.4f}' for v in test_accuracies], textposition='auto',
marker_color='lightgreen', name='accuracy'),
row=1, col=1
)
fig.add_trace(
go.Bar(x=list(test_results.keys()), y=test_mses,
text=[f'{v:.4f}' for v in test_mses], textposition='auto',
marker_color='pink', name='MSE'),
row=1, col=2
)
fig.update_layout(height=500, title_text="model performance on test set", showlegend=False)
fig.update_xaxes(title_text="Model", row=1, col=1)
fig.update_xaxes(title_text="Model", row=1, col=2)
fig.update_yaxes(title_text="accuracy", row=1, col=1)
fig.update_yaxes(title_text="MSE", row=1, col=2)
fig.show()
In [21]:
test_results
Out[21]:
{'random forest': {'accuracy': 0.659972299168975,
'mse': 7.622229916897507,
'predictions': array([3, 9, 3, ..., 1, 9, 7])},
'logistic regression': {'accuracy': 0.4984072022160665,
'mse': 12.402285318559557,
'predictions': array([3, 9, 0, ..., 3, 9, 7])},
'Neural Network': {'accuracy': 0.5545013850415512,
'mse': 10.240789473684211,
'predictions': array([3, 9, 0, ..., 3, 9, 7])},
'SVM': {'accuracy': 0.5445290858725762,
'mse': 10.865789473684211,
'predictions': array([3, 9, 0, ..., 3, 9, 7])}}
Hyperparameter Tuning and Results¶
In [22]:
models
Out[22]:
{'random forest': {'model': RandomForestClassifier(n_jobs=-1),
'params': {'n_estimators': [50, 100],
'max_depth': [10, 20],
'min_samples_split': [2, 5]}},
'logistic regression': {'model': LogisticRegression(max_iter=1000, n_jobs=-1),
'params': {'C': [0.1, 1], 'solver': ['lbfgs', 'liblinear']}},
'Neural Network': {'model': MLPClassifier(max_iter=500, random_state=42),
'params': {'hidden_layer_sizes': [(50,), (100,)],
'alpha': [0.0001, 0.001],
'learning_rate': ['constant']}},
'SVM': {'model': SVC(random_state=42),
'params': {'C': [1], 'kernel': ['rbf']}}}
get feature importance for logistic regression, SVM, and Neural Network models
In [23]:
best_models
Out[23]:
{'random forest': RandomForestClassifier(max_depth=20, min_samples_split=5, n_jobs=-1),
'logistic regression': LogisticRegression(C=1, max_iter=1000, n_jobs=-1),
'Neural Network': MLPClassifier(alpha=0.001, max_iter=500, random_state=42),
'SVM': SVC(C=1, random_state=42)}
Classification Reports¶
In [24]:
best_model_name = max(test_results.keys(), key=lambda x: test_results[x]['accuracy'])
best_model = best_models[best_model_name]
print(f"Best model: {best_model_name}")
print(f"Test accuracy: {test_results[best_model_name]['accuracy']:.4f}")
print(f"Test mse: {test_results[best_model_name]['mse']:.4f}")
y_pred_best = test_results[best_model_name]['predictions']
print(f"\nClassification report for {best_model_name}:")
print(classification_report(y_test, y_pred_best,
target_names=[label_encoder.inverse_transform([i])[0]
for i in range(len(label_encoder.classes_))]))
Best model: random forest
Test accuracy: 0.6600
Test mse: 7.6222
Classification report for random forest:
precision recall f1-score support
Apparel & Accessories|Clothing Accessories|Sunglasses 0.72 0.68 0.70 915
Apparel & Accessories|Clothing|Outerwear|Coats & Jackets 0.55 0.34 0.42 958
Apparel & Accessories|Clothing|Pants 0.71 0.79 0.75 2858
Apparel & Accessories|Clothing|Shirts & Tops 0.53 0.56 0.55 2392
Apparel & Accessories|Clothing|Shorts 0.69 0.43 0.53 550
Apparel & Accessories|Clothing|Skirts 0.68 0.22 0.33 374
Apparel & Accessories|Handbags, Wallets & Cases 0.59 0.48 0.53 1265
Apparel & Accessories|Jewelry|Earrings 0.70 0.46 0.55 301
Apparel & Accessories|Jewelry|Necklaces 0.69 0.46 0.55 286
Apparel & Accessories|Shoes 0.70 0.84 0.76 4541
accuracy 0.66 14440
macro avg 0.66 0.52 0.57 14440
weighted avg 0.66 0.66 0.65 14440
In [25]:
for i in test_results:
y_pred_best = test_results[i]['predictions']
print(f"\nClassification report for {i.title()}:")
print(classification_report(y_test, y_pred_best,
target_names=[label_encoder.inverse_transform([i])[0]
for i in range(len(label_encoder.classes_))]))
Classification report for Random Forest:
precision recall f1-score support
Apparel & Accessories|Clothing Accessories|Sunglasses 0.72 0.68 0.70 915
Apparel & Accessories|Clothing|Outerwear|Coats & Jackets 0.55 0.34 0.42 958
Apparel & Accessories|Clothing|Pants 0.71 0.79 0.75 2858
Apparel & Accessories|Clothing|Shirts & Tops 0.53 0.56 0.55 2392
Apparel & Accessories|Clothing|Shorts 0.69 0.43 0.53 550
Apparel & Accessories|Clothing|Skirts 0.68 0.22 0.33 374
Apparel & Accessories|Handbags, Wallets & Cases 0.59 0.48 0.53 1265
Apparel & Accessories|Jewelry|Earrings 0.70 0.46 0.55 301
Apparel & Accessories|Jewelry|Necklaces 0.69 0.46 0.55 286
Apparel & Accessories|Shoes 0.70 0.84 0.76 4541
accuracy 0.66 14440
macro avg 0.66 0.52 0.57 14440
weighted avg 0.66 0.66 0.65 14440
Classification report for Logistic Regression:
precision recall f1-score support
Apparel & Accessories|Clothing Accessories|Sunglasses 0.62 0.57 0.59 915
Apparel & Accessories|Clothing|Outerwear|Coats & Jackets 0.00 0.00 0.00 958
Apparel & Accessories|Clothing|Pants 0.51 0.73 0.60 2858
Apparel & Accessories|Clothing|Shirts & Tops 0.37 0.51 0.43 2392
Apparel & Accessories|Clothing|Shorts 0.00 0.00 0.00 550
Apparel & Accessories|Clothing|Skirts 0.00 0.00 0.00 374
Apparel & Accessories|Handbags, Wallets & Cases 0.49 0.03 0.06 1265
Apparel & Accessories|Jewelry|Earrings 0.68 0.17 0.28 301
Apparel & Accessories|Jewelry|Necklaces 0.08 0.00 0.01 286
Apparel & Accessories|Shoes 0.54 0.72 0.62 4541
accuracy 0.50 14440
macro avg 0.33 0.27 0.26 14440
weighted avg 0.43 0.50 0.43 14440
Classification report for Neural Network:
precision recall f1-score support
Apparel & Accessories|Clothing Accessories|Sunglasses 0.70 0.60 0.65 915
Apparel & Accessories|Clothing|Outerwear|Coats & Jackets 0.36 0.06 0.10 958
Apparel & Accessories|Clothing|Pants 0.60 0.75 0.67 2858
Apparel & Accessories|Clothing|Shirts & Tops 0.41 0.46 0.43 2392
Apparel & Accessories|Clothing|Shorts 0.46 0.19 0.27 550
Apparel & Accessories|Clothing|Skirts 0.00 0.00 0.00 374
Apparel & Accessories|Handbags, Wallets & Cases 0.41 0.34 0.37 1265
Apparel & Accessories|Jewelry|Earrings 0.61 0.19 0.29 301
Apparel & Accessories|Jewelry|Necklaces 0.40 0.23 0.30 286
Apparel & Accessories|Shoes 0.61 0.77 0.68 4541
accuracy 0.55 14440
macro avg 0.46 0.36 0.38 14440
weighted avg 0.52 0.55 0.52 14440
Classification report for Svm:
precision recall f1-score support
Apparel & Accessories|Clothing Accessories|Sunglasses 0.69 0.58 0.63 915
Apparel & Accessories|Clothing|Outerwear|Coats & Jackets 0.25 0.00 0.00 958
Apparel & Accessories|Clothing|Pants 0.60 0.74 0.66 2858
Apparel & Accessories|Clothing|Shirts & Tops 0.39 0.48 0.43 2392
Apparel & Accessories|Clothing|Shorts 0.54 0.14 0.22 550
Apparel & Accessories|Clothing|Skirts 0.00 0.00 0.00 374
Apparel & Accessories|Handbags, Wallets & Cases 0.44 0.23 0.30 1265
Apparel & Accessories|Jewelry|Earrings 0.75 0.17 0.28 301
Apparel & Accessories|Jewelry|Necklaces 0.31 0.10 0.15 286
Apparel & Accessories|Shoes 0.58 0.79 0.67 4541
accuracy 0.54 14440
macro avg 0.45 0.32 0.34 14440
weighted avg 0.51 0.54 0.50 14440
Feature importance¶
In [26]:
tree_models = ['random forest']
for model_name in tree_models:
if model_name in best_models:
model = best_models[model_name]
if hasattr(model, 'feature_importances_'):
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
'feature': feature_columns,
'importance': importances
}).sort_values('importance', ascending=False)
fig = go.Figure(data=[
go.Bar(
x=feature_importance_df['importance'],
y=feature_importance_df['feature'],
orientation='h',
marker_color='pink'
)
])
fig.update_layout(
title=f'{model_name.title()} Feature Importance',
xaxis_title='Importance',
yaxis_title='Feature',
height=400
)
fig.show()
print(f"{model_name} top 5 most important features:")
print(feature_importance_df.head())
random forest top 5 most important features:
feature importance
1 y1 0.124657
8 center_y 0.118415
3 y2 0.099541
5 height 0.096828
9 aspect_ratio 0.080079
In [27]:
# Logistic Regression feature importance
model_name = 'logistic regression'
model = best_models[model_name]
print(f"Computing feature importance for: {model_name}")
importances = None
if hasattr(model, 'coef_'):
coef = model.coef_
if getattr(coef, 'ndim', 1) == 1:
importances = np.abs(coef)
else:
importances = np.mean(np.abs(coef), axis=0)
# Build dataframe and plot
feature_importance_df = pd.DataFrame({
'feature': feature_columns,
'importance': importances
}).sort_values('importance', ascending=False)
fig = go.Figure(data=[
go.Bar(
x=feature_importance_df['importance'],
y=feature_importance_df['feature'],
orientation='h',
marker_color='pink'
)
])
fig.update_layout(
title=f'{model_name.title()} Feature Importance',
xaxis_title='Importance',
yaxis_title='Feature',
height=400
)
fig.show()
print(f"{model_name} top 5 most important features:")
print(feature_importance_df.head())
Computing feature importance for: logistic regression
logistic regression top 5 most important features:
feature importance
10 diagonal 3.054276
11 perimeter 1.345586
5 height 1.311811
6 area 1.185332
4 width 1.167401
Neural Network Architecture¶
In [28]:
nn_model = best_models["Neural Network"] # or whatever you called it
n_inputs = nn_model.coefs_[0].shape[0]
hidden_sizes = nn_model.hidden_layer_sizes if isinstance(nn_model.hidden_layer_sizes, tuple) else (nn_model.hidden_layer_sizes,)
n_outputs = nn_model.coefs_[-1].shape[1]
layers = [n_inputs] + list(hidden_sizes) + [n_outputs]
labels = [
f"Input\n({n_inputs})",
*[f"Hidden {i+1}\n({h})" for i, h in enumerate(hidden_sizes)],
f"Output\n({n_outputs})"
]
pink_colors = ["#FFC0CB", "#FF69B4", "#C71585"]
plt.figure(figsize=(6, 2))
for i, (size, lab) in enumerate(zip(layers, labels)):
plt.scatter(i, 0, s=3000, marker='s', c =pink_colors[min(i, len(pink_colors)-1)])
plt.text(i, 0, lab, ha='center', va='center')
plt.axis('off')
plt.title("Network Architecture (Layers Only)")
plt.show()
SVMs¶
In [29]:
svm_model = best_models["SVM"] # or whatever you called it
classes = svm_model.classes_
support_counts = svm_model.n_support_
class_labels = ['Sunglasses', 'Coats & Jackets', 'Pants', 'Shirts & Tops', 'Shorts', 'Skirts', 'Handbags, Wallets & Cases', 'Earrings', 'Necklaces', 'Shoes']
plt.figure(figsize=(8,4))
plt.bar(class_labels, support_counts, color="#FF69B4")
plt.xticks(fontsize=8)
plt.title("Number of Support Vectors per Class")
plt.xlabel("Class")
plt.ylabel("Support Vector Count")
plt.xticks(rotation=45)
plt.show()
Summary¶
In [30]:
summary_data = {
'Model': list(test_results.keys()),
'CV_accuracy': [best_scores[name] for name in test_results.keys()],
'Test_accuracy': [test_results[name]['accuracy'] for name in test_results.keys()],
'Test_MSE': [test_results[name]['mse'] for name in test_results.keys()]
}
summary_df = pd.DataFrame(summary_data)
summary_df = summary_df.sort_values('Test_accuracy', ascending=False)
print(summary_df.to_string(index=False))
print(f"\nbest model: {best_model_name}")
print(f"test accuracy: {test_results[best_model_name]['accuracy']:.4f}")
print(f"test mse: {test_results[best_model_name]['mse']:.4f}")
Model CV_accuracy Test_accuracy Test_MSE
random forest 0.639150 0.659972 7.622230
Neural Network 0.550348 0.554501 10.240789
SVM 0.539994 0.544529 10.865789
logistic regression 0.496278 0.498407 12.402285
best model: random forest
test accuracy: 0.6600
test mse: 7.6222